import warnings

import numpy as np
import pandas as pd


def lm(x, y, data, intercept=True):
    """Returns the least-squares coefficients from regressing y on x.

    Non-numeric predictors (object, string or category columns) with k levels
    are expanded into k-1 0/1 indicator columns named "<column>_<level>";
    numeric predictors are used as they are.

    The fit is computed with an SVD-based least-squares solver applied to the
    design matrix itself, not by solving the normal equations (X'X) b = X'Y.
    The normal equations square the condition number and have no unique
    solution when predictors are perfectly collinear (e.g. two indicator
    columns that are always equal). In that case a RuntimeWarning is issued
    and the minimum-norm solution is returned, with the intercept left
    unpenalized; this is the same convention scikit-learn's LinearRegression
    follows, so the two agree even on rank-deficient designs.

    Inputs:
    - x: a list containing the names of the x variables (a single name given
         as a string is also accepted; an empty list fits an intercept-only
         model)
    - y: the name of the y variable
    - data: a Pandas data frame (the names in x and y must be columns in this
            data frame)
    - intercept: boolean indicating whether or not to include an intercept term

    Outputs: A Pandas series with the estimated coefficients, indexed by the
    x variable names (preceded by "Intercept" when intercept=True).
    """
    # a bare string would otherwise be iterated character by character
    if isinstance(x, str):
        x = [x]

    # build the design matrix, expanding categorical variables into dummies
    pieces = []
    for col in x:
        if pd.api.types.is_numeric_dtype(data[col]):
            pieces.append(data[[col]])
        else:
            # dtype=float: newer pandas returns bool dummies, which would turn
            # the mixed design matrix into an object array
            pieces.append(
                pd.get_dummies(data[[col]], columns=[col],
                               drop_first=True, dtype=float)
            )
    if pieces:
        X = pd.concat(pieces, axis=1).astype(float)
    else:
        X = pd.DataFrame(index=data.index)
    names = list(X.columns)

    A = np.asarray(X, dtype=float)
    b = np.asarray(data[y], dtype=float)

    # Centering removes the intercept from the least-squares problem, so it is
    # not shrunk by the minimum-norm criterion; it is recovered afterwards.
    if intercept:
        a_mean = A.mean(axis=0)
        b_mean = b.mean()
        A = A - a_mean
        b = b - b_mean

    if A.shape[1] > 0:
        beta, _, rank, _ = np.linalg.lstsq(A, b, rcond=None)
        if rank < A.shape[1]:
            warnings.warn(
                "Design matrix is rank deficient (rank %d < %d columns); "
                "coefficients are not uniquely determined, returning the "
                "minimum-norm solution." % (rank, A.shape[1]),
                RuntimeWarning,
                stacklevel=2,
            )
    else:
        beta = np.zeros(0)

    if intercept:
        beta = np.concatenate(([b_mean - a_mean.dot(beta)], beta))
        names = ["Intercept"] + names

    return pd.Series(data=beta, index=names)
# %%
# scikit-learn is only used as the reference implementation in the checks below.
from sklearn.linear_model import LinearRegression

# %% [markdown]
# ## Some Data To Test Your Code

# %%
# Column names of the automobile data set; the file has no header row, and
# "price" (the response) is its last column.
predictors = ["symboling", "normalized-losses", "make", "fuel-type",
              "aspiration", "num-of-doors", "body-style", "drive-wheels",
              "engine-location", "wheel-base", "length", "width",
              "height", "curb-weight", "engine-type", "num-of-cylinders",
              "engine-size", "fuel-system", "bore", "stroke",
              "compression-ratio", "horsepower", "peak-rpm", "city-mpg",
              "highway-mpg"]
data = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data",
                   header=None,
                   names=predictors + ["price"])

# %% [markdown]
# The following code strips out missing values (represented by "?" in this
# data set) and converts columns to numeric types before fitting linear
# regression to the data.

# %%
print(data.shape)

# Only text columns can contain the "?" marker.
text_cols = [col for col in data.columns if data[col].dtype == object]

# Drop every row with a "?" in any text column in one pass. The explicit copy
# makes the column assignments below write to an independent frame rather than
# to a slice of the original (which triggers SettingWithCopyWarning).
complete = ~data[text_cols].eq("?").any(axis=1)
data = data.loc[complete].copy()

for col in text_cols:
    try:
        data[col] = pd.to_numeric(data[col])
    except (ValueError, TypeError):
        # The column holds non-numeric labels: leave it as a categorical.
        # Anything other than a parse failure is a real error and propagates.
        pass

print(data.shape)

# %% [markdown]
# ## Test 1: Quantitative Predictors Only
#
# Let's test out the `lm` function you just wrote on some quantitative
# predictors.

# %%
lm(["length", "width", "height"], "price", data)

# %% [markdown]
# Check that your `lm` function produces the same results as scikit-learn.

# %%
model = LinearRegression()
model.fit(data[["length", "width", "height"]], data["price"])
model.intercept_, model.coef_

# %% [markdown]
# ## Test 2: Categorical Predictors
#
# Your `lm` function should also do the right thing for categorical variables
# automatically (i.e., it should expand categorical variables with $k$ levels
# into $k-1$ 0-1 variables automatically).

# %%
coefs1 = lm(predictors, "price", data)
coefs1

# %% [markdown]
# Check that your `lm` function produces the same results as scikit-learn.

# %%
model = LinearRegression()
data_expanded = pd.get_dummies(data[predictors], drop_first=True)
model.fit(data_expanded, data["price"])
coefs2 = pd.Series(model.coef_, index=data_expanded.columns)
coefs2["Intercept"] = model.intercept_
coefs2

# %% [markdown]
# To debug why the intercepts are different (but most of the coefficients seem
# correct), we have to compare the coefficients from our `lm` function with
# those from scikit-learn. It's pretty hard to eyeball because there are so
# many coefficients, so let's join the two series to each other.
\n", " | 0 | \n", "1 | \n", "
---|---|---|
Intercept | \n", "101623.946789 | \n", "17767.132374 | \n", "
aspiration_turbo | \n", "2171.490389 | \n", "2171.490389 | \n", "
body-style_hardtop | \n", "-5626.870389 | \n", "-5626.870389 | \n", "
body-style_hatchback | \n", "-5735.987637 | \n", "-5735.987637 | \n", "
body-style_sedan | \n", "-5702.431171 | \n", "-5702.431171 | \n", "
body-style_wagon | \n", "-5647.437732 | \n", "-5647.437732 | \n", "
bore | \n", "-881.685811 | \n", "-881.685811 | \n", "
city-mpg | \n", "-156.388896 | \n", "-156.388896 | \n", "
compression-ratio | \n", "-700.029130 | \n", "-700.029130 | \n", "
curb-weight | \n", "5.208202 | \n", "5.208202 | \n", "
drive-wheels_fwd | \n", "-29.356867 | \n", "-29.356867 | \n", "
drive-wheels_rwd | \n", "1977.187880 | \n", "1977.187880 | \n", "
engine-size | \n", "-12.438327 | \n", "-12.438327 | \n", "
engine-type_l | \n", "-78263.126984 | \n", "-4832.244831 | \n", "
engine-type_ohc | \n", "-1913.256121 | \n", "-1913.256121 | \n", "
engine-type_ohcf | \n", "51019.479134 | \n", "-3658.347820 | \n", "
engine-type_ohcv | \n", "-1337.149854 | \n", "-1337.149854 | \n", "
fuel-system_2bbl | \n", "2069.509402 | \n", "2069.509402 | \n", "
fuel-system_idi | \n", "-78533.441448 | \n", "5323.372967 | \n", "
fuel-system_mfi | \n", "3467.897329 | \n", "3467.897329 | \n", "
fuel-system_mpfi | \n", "2601.780297 | \n", "2601.780297 | \n", "
fuel-system_spdi | \n", "1080.926758 | \n", "1080.926758 | \n", "
fuel-type_gas | \n", "-89180.187383 | \n", "-5323.372967 | \n", "
height | \n", "-335.218743 | \n", "-335.218743 | \n", "
highway-mpg | \n", "128.416202 | \n", "128.416202 | \n", "
horsepower | \n", "-20.192172 | \n", "-20.192172 | \n", "
length | \n", "-76.626594 | \n", "-76.626594 | \n", "
make_bmw | \n", "359.610692 | \n", "359.610692 | \n", "
make_chevrolet | \n", "-4745.387730 | \n", "-4745.387730 | \n", "
make_dodge | \n", "-6209.191496 | \n", "-6209.191496 | \n", "
make_honda | \n", "-1582.712461 | \n", "-1582.712461 | \n", "
make_jaguar | \n", "2430.797572 | \n", "2430.797572 | \n", "
make_mazda | \n", "-4062.659445 | \n", "-4062.659445 | \n", "
make_mercedes-benz | \n", "2548.338064 | \n", "2548.338064 | \n", "
make_mitsubishi | \n", "-6327.595985 | \n", "-6327.595985 | \n", "
make_nissan | \n", "-3689.681752 | \n", "-3689.681752 | \n", "
make_peugot | \n", "68443.249796 | \n", "-4987.632357 | \n", "
make_plymouth | \n", "-6024.951373 | \n", "-6024.951373 | \n", "
make_porsche | \n", "4830.408327 | \n", "4830.408327 | \n", "
make_saab | \n", "-404.038771 | \n", "-404.038771 | \n", "
make_subaru | \n", "-58336.174774 | \n", "-3658.347820 | \n", "
make_toyota | \n", "-5869.407539 | \n", "-5869.407539 | \n", "
make_volkswagen | \n", "-4297.346425 | \n", "-4297.346425 | \n", "
make_volvo | \n", "-2871.342061 | \n", "-2871.342061 | \n", "
normalized-losses | \n", "5.577457 | \n", "5.577457 | \n", "
num-of-cylinders_five | \n", "-4108.064456 | \n", "-4108.064456 | \n", "
num-of-cylinders_four | \n", "-4688.467080 | \n", "-4688.467080 | \n", "
num-of-cylinders_six | \n", "-2976.251738 | \n", "-2976.251738 | \n", "
num-of-cylinders_three | \n", "73586.269679 | \n", "155.387526 | \n", "
num-of-doors_two | \n", "-838.068778 | \n", "-838.068778 | \n", "
peak-rpm | \n", "-0.537667 | \n", "-0.537667 | \n", "
stroke | \n", "-567.659597 | \n", "-567.659597 | \n", "
symboling | \n", "-5.067821 | \n", "-5.067821 | \n", "
wheel-base | \n", "318.440516 | \n", "318.440516 | \n", "
width | \n", "243.692078 | \n", "243.692078 | \n", "